Loading Library

library(RColorBrewer)
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.0     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.0
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(ggplot2)
library(readr)
library(data.table)
## 
## Attaching package: 'data.table'
## 
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## 
## The following object is masked from 'package:purrr':
## 
##     transpose
library(tidyverse)
library(stringr)
library(dplyr)
library(formattable)
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:formattable':
## 
##     style
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(readr)
library(htmltools)
library(htmlwidgets)
bin062 <- read_csv("bin062_topfiftyhits.txt", 
                   col_names = c("qseqid", "sseqid", "evalue", "staxids", "sscinames", "sskingdoms", "stitle"))
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 7028 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): qseqid, sseqid, staxids, sscinames, sskingdoms, stitle
## dbl (1): evalue
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
bin062_eval <- bin062 %>% 
  filter(as.numeric(evalue) <= 1e-10)
bin062_kingdomcount <- bin062_eval %>% 
  count(sskingdoms)
bin062_kingdomcount
## # A tibble: 4 × 2
##   sskingdoms     n
##   <chr>      <int>
## 1 Archaea       62
## 2 Bacteria     776
## 3 Eukaryota    942
## 4 Viruses       69
piechart_bin062 <- ggplot(bin062_kingdomcount, aes(x="", y=n, fill=sskingdoms)) +
  geom_bar(stat="identity", width=1) +
  coord_polar("y", start=0) +
  scale_fill_brewer(palette = "PiYG") +
   theme(axis.text = element_blank(),
        axis.ticks = element_blank(),
        panel.grid  = element_blank())
piechart_bin062

bin062_rmduplicates <- bin062_eval %>% 
  group_by(qseqid) %>% 
  distinct(sseqid, .keep_all = TRUE)
bin062_eukaryotes <- bin062_rmduplicates %>%
  filter(sskingdoms == "Eukaryota") %>%
  group_by(qseqid, sseqid, staxids) %>%
  slice_min(order_by = evalue) %>%
  ungroup() %>% 
  mutate(genus = word(sscinames, 1))
bin062_eukaryotes
## # A tibble: 552 × 8
##    qseqid             sseqid         evalue staxids sscin…¹ sskin…² stitle genus
##    <chr>              <chr>           <dbl> <chr>   <chr>   <chr>   <chr>  <chr>
##  1 Contig24240559_114 emb|CAH9114… 7.14e-18 186058  Cuscut… Eukary… unnam… Cusc…
##  2 Contig24240559_114 emb|CAI9287… 2.77e-18 75948   Lactuc… Eukary… unnam… Lact…
##  3 Contig24240559_114 emb|VAI3877… 9.77e-18 4567    Tritic… Eukary… unnam… Trit…
##  4 Contig24240559_114 emb|VAI3877… 1.03e-17 4567    Tritic… Eukary… unnam… Trit…
##  5 Contig24240559_114 emb|VAI3878… 2.5 e-18 4567    Tritic… Eukary… unnam… Trit…
##  6 Contig24240559_114 emb|VAI3878… 9.37e-18 4567    Tritic… Eukary… unnam… Trit…
##  7 Contig24240559_114 gb|KAF33316… 5.24e-19 544730  Carex … Eukary… histo… Carex
##  8 Contig24240559_114 gb|KAF36610… 1.08e-17 4072    Capsic… Eukary… Histo… Caps…
##  9 Contig24240559_114 gb|KAF81024… 1.11e-17 3728    Sinapi… Eukary… hypot… Sina…
## 10 Contig24240559_114 gb|KAF86837… 2.02e-18 1010633 Digita… Eukary… hypot… Digi…
## # … with 542 more rows, and abbreviated variable names ¹​sscinames, ²​sskingdoms
# Count the number of times each genus appears for a specific protein sequence
bin062_egenus_counts <- bin062_eukaryotes %>%
  group_by(qseqid, genus) %>%
  summarise(count = n(), .groups = "keep") %>% 
  filter(count > 1) # Only include those with more than one count 
print(bin062_egenus_counts)
## # A tibble: 84 × 3
## # Groups:   qseqid, genus [84]
##    qseqid             genus               count
##    <chr>              <chr>               <int>
##  1 Contig24240559_114 Ambrosia                4
##  2 Contig24240559_114 Camelina                3
##  3 Contig24240559_114 Helianthus              7
##  4 Contig24240559_114 Hordeum                 2
##  5 Contig24240559_114 Lolium                  7
##  6 Contig24240559_114 Panicum                 4
##  7 Contig24240559_114 Triticum                9
##  8 Contig24240559_119 Closterium              2
##  9 Contig24240559_119 Parelaphostrongylus     2
## 10 Contig24240559_13  Cyberlindnera           2
## # … with 74 more rows
# Count the number of times each genus appears for a specific protein sequence
bin062_egenus_counts_3 <- bin062_eukaryotes %>%
  group_by(qseqid, genus) %>%
  summarise(count = n(), .groups = "keep") %>% 
  filter(count > 5) # Only include those with more than one count 
print(bin062_egenus_counts_3)
## # A tibble: 14 × 3
## # Groups:   qseqid, genus [14]
##    qseqid             genus       count
##    <chr>              <chr>       <int>
##  1 Contig24240559_114 Helianthus      7
##  2 Contig24240559_114 Lolium          7
##  3 Contig24240559_114 Triticum        9
##  4 Contig24240559_138 Brassica       15
##  5 Contig24240559_149 Trichomonas    12
##  6 Contig24240559_158 Astyanax        9
##  7 Contig24240559_158 Daphnia         6
##  8 Contig24240559_158 Helicoverpa    19
##  9 Contig24240559_158 Oreochromis     7
## 10 Contig24240559_57  Lentinula       6
## 11 Contig24240559_89  Naegleria       6
## 12 Contig61333578_11  Rhizophagus     7
## 13 Contig61333578_17  Arachis         9
## 14 Contig61333578_54  Seminavis      27
distinct_colors <- c("skyblue1", "#E6F5D0", "#B8E186", "#7FBC41", "#4D9221", "#276419", "#E41A1C", "#377EB8", "#4DAF4A", "#984EA3", "#FF7F00", "#FFFF33", "#A65628", "#F781BF", "#999999", "#A6CEE3", "#1F78B4", "#B2DF8A", "#33A02C", "#FB9A99", "#E31A1C", "#FDBF6F", "#FF7F00", "#CAB2D6", "#6A3D9A", "#FFFF99", "#B15928", "#FBB4AE", "#B3CDE3", "#CCEBC5", "#DECBE4", "#FED9A6", "midnightblue", "#E5D8BD", "cornflowerblue", "#8DD3C7", "royalblue", "#BEBADA", "#FB8072", "#80B1D3", "#FDB462", "#B3DE69", "darkcyan", "#D9D9D9", "#BC80BD", "#CCEBC5", "chartreuse2", "#7FC97F", "#BEAED4", "purple", "cadetblue", "#386CB0", "#F0027F", "#BF5B17", "#666666", "#8E0152", "#C51B7D", "#DE77AE", "#F1B6DA", "#FDE0EF", "lightpink", "lightgreen", "mediumpurple1", "lightgoldenrod2", "lavender", "lavenderblush1", "lightblue2", "lightcyan1", "lemonchiffon2", "lawngreen", "khaki3", "ivory2", "indianred2", "hotpink2", "honeydew2", "mistyrose1", "midnightblue", "mediumvioletred", "mediumslateblue", "mediumpurple3", "mediumorchid1", "maroon2", "maroon", "limegreen", "lightsteelblue", "lightskyblue", "lightseagreen", "lightsalmon", "lightpink2", "pink", "peachpuff", "palevioletred1", "paleturquoise", "palegreen", "orchid1", "orange1", "olivedrab1", "purple2", "plum1", "steelblue1", "royalblue1", "skyblue",  "coral1", "chartreuse2", "darkgreen", "deepskyblue", "cornflowerblue", "darkcyan", "moccasin")
bin062_eukaryotes_ggplot <- ggplot(bin062_egenus_counts_3, aes(x = genus, y = count)) +
  geom_bar(stat = "identity", width = 0.7, position = "stack") +
  scale_y_continuous(labels = function(y) str_wrap(y, width = 3)) +
  theme_minimal() +
  theme(text = element_text(size = 25), axis.text.x = element_text(angle = 90, hjust = 1), legend.position = "bottom")
bin062_eukaryotes_ggplot

bin062_eukaryotes_ggplot <- ggplot(bin062_egenus_counts, aes(x = qseqid, y = count, fill = genus)) +
  geom_bar(stat = "identity", width = 0.7, position = "stack") +
  scale_y_continuous(labels = function(y) str_wrap(y, width = 3)) +
  scale_fill_manual(values = distinct_colors) +
  theme_minimal() +
  theme(text = element_text(size = 15), axis.text.x = element_text(angle = 90, hjust = 1), legend.position = "bottom")
bin062_eukaryotes_ggplot

ggplotly(bin062_eukaryotes_ggplot)

Bacteria HGT

bin062_bacteria <- bin062_rmduplicates %>% 
  filter(sskingdoms == "Bacteria") %>% 
   group_by(qseqid, sseqid, staxids) %>%
  slice_min(order_by = evalue) %>%
  ungroup() %>% 
  mutate(genus = word(sscinames, 1))
bin062_bacteria
## # A tibble: 470 × 8
##    qseqid             sseqid         evalue staxids sscin…¹ sskin…² stitle genus
##    <chr>              <chr>           <dbl> <chr>   <chr>   <chr>   <chr>  <chr>
##  1 Contig24240559_118 gb|MBR54275… 4.37e-11 2044939 Clostr… Bacter… S8 fa… Clos…
##  2 Contig24240559_118 gb|MDD31137… 9.59e-14 3018265 Candid… Bacter… hypot… Cand…
##  3 Contig24240559_118 gb|MDD39583… 2.45e-12 3018265 Candid… Bacter… DUF50… Cand…
##  4 Contig24240559_118 gb|MDD49882… 7.69e-14 3018265 Candid… Bacter… hypot… Cand…
##  5 Contig24240559_118 gb|MDD56019… 7.64e-11 3018265 Candid… Bacter… hypot… Cand…
##  6 Contig24240559_119 emb|CAI2158… 6.35e-18 42906   Serrat… Bacter… PPE-r… Serr…
##  7 Contig24240559_119 emb|CAI2158… 5.51e-17 42906   Serrat… Bacter… PPE-r… Serr…
##  8 Contig24240559_119 emb|CUQ4059… 6.06e-17 292800  Flavon… Bacter… PPE-r… Flav…
##  9 Contig24240559_119 emb|SUE8074… 5.17e-21 28901   Salmon… Bacter… PPE-r… Salm…
## 10 Contig24240559_119 gb|AMZ63060… 3.28e-19 28901;… Salmon… Bacter… hypot… Salm…
## # … with 460 more rows, and abbreviated variable names ¹​sscinames, ²​sskingdoms
# Count the number of times each genus appears for a specific protein sequence
bin062_bacteria_counts <- bin062_bacteria %>%
  group_by(qseqid, genus) %>%
  summarise(count = n(), .groups = "keep") %>% 
  filter(count > 2) # Only include those with more than one count 
print(bin062_bacteria_counts)
## # A tibble: 36 × 3
## # Groups:   qseqid, genus [36]
##    qseqid             genus               count
##    <chr>              <chr>               <int>
##  1 Contig24240559_118 Candidatus              4
##  2 Contig24240559_119 Salmonella             34
##  3 Contig24240559_149 Wolbachia              10
##  4 Contig24240559_29  Candidatus              3
##  5 Contig24240559_29  Corallococcus           3
##  6 Contig24240559_29  Deltaproteobacteria     4
##  7 Contig24240559_4   Candidatus              8
##  8 Contig24240559_45  Thiomonas               9
##  9 Contig24240559_45  Undibacterium          12
## 10 Contig24240559_48  Echinicola              3
## # … with 26 more rows
bin062_bacteria_ggplot <- ggplot(bin062_bacteria_counts, aes(x = genus, y = count)) +
  geom_bar(stat = "identity", width = 0.7, position = "stack") +
  scale_y_continuous(labels = function(y) str_wrap(y, width = 3)) +
  theme_minimal() +
  theme(text = element_text(size = 15), axis.text.x = element_text(angle = 90, hjust = 1), legend.position = "bottom")
bin062_bacteria_ggplot

Viruses

bin062_viruses <- bin062_rmduplicates %>% 
  filter(sskingdoms == "Viruses")
bin062_viruses
## # A tibble: 45 × 7
## # Groups:   qseqid [4]
##    qseqid            sseqid                evalue staxids sscin…¹ sskin…² stitle
##    <chr>             <chr>                  <dbl> <chr>   <chr>   <chr>   <chr> 
##  1 Contig61333578_20 tpg|DAR88359.1|     1.65e-12 38018   Bacter… Viruses MAG T…
##  2 Contig61333578_20 tpg|DAK66064.1|     2.48e-12 2832643 Caudov… Viruses MAG T…
##  3 Contig61333578_35 ref|YP_010778856.1| 1.58e-16 2024608 Bodo s… Viruses DEXDc…
##  4 Contig61333578_35 ref|YP_010789339.1| 1.21e-12 2109587 Moumou… Viruses DEAD-…
##  5 Contig61333578_35 gb|ARF09927.1|      1.62e-12 1977633 Indivi… Viruses super…
##  6 Contig61333578_35 gb|QGR54383.1|      1.92e-12 2682469 Moumou… Viruses hypot…
##  7 Contig61333578_35 gb|AHA45880.1|      2.62e-12 1420594 Hirudo… Viruses helic…
##  8 Contig61333578_35 gb|AVG46838.1|      2.9 e-12 212035  Acanth… Viruses DEAD-…
##  9 Contig61333578_35 gb|AHJ39841.2|      6.97e-12 1461100 Samba … Viruses repli…
## 10 Contig61333578_35 gb|AEJ34322.1|      7.49e-12 212035  Acanth… Viruses repli…
## # … with 35 more rows, and abbreviated variable names ¹​sscinames, ²​sskingdoms